package cc.owoo.godpen.analysis.html;

import cc.owoo.godpen.analysis.StringAnalysis;

import java.util.HashMap;
import java.util.Map;

/**
 * HTML string parsing: a {@link StringAnalysis} whose {@link #nextString(char)} decodes
 * HTML character references (for example {@code &amp;} or {@code &#38;}) while reading.
 * Created by nimensei
 * 2022-06-01 05:18 PM
 */
public class HtmlStringAnalysis extends StringAnalysis {
    /** Entity name to character; also contains the forward-only aliases such as {@code ldquo}. */
    private static final Map<String, Character> CHARACTERS_BY_NAME = new HashMap<>(256);

    /** Character to its primary entity name; aliases are intentionally not registered here. */
    private static final Map<Character, String> NAMES_BY_CHARACTER = new HashMap<>(256);

    // Single source of truth for both lookup directions.
    // NOTE(review): a few mappings are normalised to ASCII rather than following the HTML
    // standard (nbsp is mapped to a plain space, the curly quotes to ASCII quotes). They are
    // kept exactly as before because callers may rely on them - confirm before changing.
    static {
        // ASCII symbols
        entity("nbsp", '\u0020');   // space
        entity("excl", '\u0021');   // !
        entity("quot", '\u0022');   // "
        alias("ldquo", '\u0022');
        alias("rdquo", '\u0022');
        entity("num", '\u0023');    // #
        entity("dollar", '\u0024'); // $
        entity("percnt", '\u0025'); // %
        entity("amp", '\u0026');    // &
        entity("apos", '\'');       // '
        alias("lsquo", '\'');
        alias("rsquo", '\'');
        entity("lpar", '\u0028');   // (
        entity("rpar", '\u0029');   // )
        entity("ast", '\u002a');    // *
        entity("plus", '\u002b');   // +
        entity("comma", '\u002c');  // ,
        entity("hyphen", '\u002d'); // -
        entity("period", '\u002e'); // .
        entity("sol", '\u002f');    // /
        entity("colon", '\u003a');  // :
        entity("semi", '\u003b');   // ;
        entity("lt", '\u003c');     // <
        entity("equals", '\u003d'); // =
        entity("gt", '\u003e');     // >
        entity("quest", '\u003f');  // ?
        entity("commat", '\u0040'); // @
        entity("lsqb", '\u005b');   // [
        entity("bsol", '\\');       // backslash
        entity("rsqb", '\u005d');   // ]
        entity("circ", '\u005e');   // ^
        entity("lowbar", '\u005f'); // _
        entity("grave", '\u0060');  // `
        entity("lcub", '\u007b');   // {
        entity("verbar", '\u007c'); // |
        entity("rcub", '\u007d');   // }
        entity("tilde", '\u007e');  // ~

        // Other special symbols
        entity("le", '\u2264');     // ≤
        entity("ge", '\u2265');     // ≥
        entity("gl", '\u2277');     // ≷
        entity("cent", '\u00a2');   // ¢
        entity("pound", '\u00a3');  // £
        entity("yen", '\u00a5');    // ¥
        entity("sect", '\u00a7');   // §
        entity("copy", '\u00a9');   // ©
        entity("times", '\u00d7');  // ×
        entity("divide", '\u00F7'); // ÷
        entity("forall", '\u2200'); // ∀
        entity("part", '\u2202');   // ∂
        entity("exist", '\u2203');  // ∃
        entity("empty", '\u2205');  // ∅
        entity("nabla", '\u2207');  // ∇
        entity("isin", '\u2208');   // ∈
        entity("notin", '\u2209');  // ∉
        entity("ni", '\u220b');     // ∋
        entity("prod", '\u220f');   // ∏
        entity("sum", '\u2211');    // ∑
        entity("minus", '\u2212');  // −
        entity("lowast", '\u2217'); // ∗
        entity("radic", '\u221a');  // √
        entity("prop", '\u221d');   // ∝
        entity("infin", '\u221e');  // ∞
        entity("ang", '\u2220');    // ∠
        entity("and", '\u2227');    // ∧
        entity("or", '\u2228');     // ∨
        entity("cap", '\u2229');    // ∩
        entity("cup", '\u222a');    // ∪
        entity("int", '\u222b');    // ∫
        entity("there4", '\u2234'); // ∴
        entity("sim", '\u223c');    // ∼
        entity("cong", '\u2245');   // ≅
        entity("asymp", '\u2248');  // ≈
        entity("ne", '\u2260');     // ≠
        entity("equiv", '\u2261');  // ≡
        entity("sub", '\u2282');    // ⊂
        entity("sup", '\u2283');    // ⊃
        entity("nsub", '\u2284');   // ⊄
        entity("sube", '\u2286');   // ⊆
        entity("supe", '\u2287');   // ⊇
        entity("oplus", '\u2295');  // ⊕
        entity("otimes", '\u2297'); // ⊗
        entity("perp", '\u22a5');   // ⊥
        entity("sdot", '\u22c5');   // ⋅

        // Greek capital letters
        entity("Alpha", '\u0391');   // Α
        entity("Beta", '\u0392');    // Β
        entity("Gamma", '\u0393');   // Γ
        entity("Delta", '\u0394');   // Δ
        entity("Epsilon", '\u0395'); // Ε
        entity("Zeta", '\u0396');    // Ζ
        entity("Eta", '\u0397');     // Η
        entity("Theta", '\u0398');   // Θ
        entity("Iota", '\u0399');    // Ι
        entity("Kappa", '\u039a');   // Κ
        entity("Lambda", '\u039b');  // Λ
        entity("Mu", '\u039c');      // Μ
        entity("Nu", '\u039d');      // Ν
        entity("Xi", '\u039e');      // Ξ
        entity("Omicron", '\u039f'); // Ο
        entity("Pi", '\u03a0');      // Π
        entity("Rho", '\u03a1');     // Ρ
        entity("Sigma", '\u03a3');   // Σ
        entity("Tau", '\u03a4');     // Τ
        entity("Upsilon", '\u03a5'); // Υ
        entity("Phi", '\u03a6');     // Φ
        entity("Chi", '\u03a7');     // Χ
        entity("Psi", '\u03a8');     // Ψ
        entity("Omega", '\u03a9');   // Ω

        // Greek small letters and variants
        entity("alpha", '\u03b1');    // α
        entity("beta", '\u03b2');     // β
        entity("gamma", '\u03b3');    // γ
        entity("delta", '\u03b4');    // δ
        entity("epsilon", '\u03b5');  // ε
        entity("zeta", '\u03b6');     // ζ
        entity("eta", '\u03b7');      // η
        entity("theta", '\u03b8');    // θ
        entity("iota", '\u03b9');     // ι
        entity("kappa", '\u03ba');    // κ
        entity("lambda", '\u03bb');   // λ
        entity("mu", '\u03bc');       // μ
        entity("nu", '\u03bd');       // ν
        entity("xi", '\u03be');       // ξ
        entity("omicron", '\u03bf');  // ο
        entity("pi", '\u03c0');       // π
        entity("rho", '\u03c1');      // ρ
        entity("sigmaf", '\u03c2');   // ς
        entity("sigma", '\u03c3');    // σ
        entity("tau", '\u03c4');      // τ
        entity("upsilon", '\u03c5');  // υ
        entity("phi", '\u03c6');      // φ
        entity("chi", '\u03c7');      // χ
        entity("psi", '\u03c8');      // ψ
        entity("omega", '\u03c9');    // ω
        entity("thetasym", '\u03d1'); // ϑ
        entity("upsih", '\u03d2');    // ϒ
        entity("piv", '\u03d6');      // ϖ

        // Latin extended letters
        entity("OElig", '\u0152');  // Œ
        entity("oelig", '\u0153');  // œ
        entity("Scaron", '\u0160'); // Š
        entity("scaron", '\u0161'); // š
        entity("Yuml", '\u0178');   // Ÿ
        entity("fnof", '\u0192');   // ƒ

        // Spaces and invisible formatting characters
        entity("ensp", '\u2002');   // en space
        entity("emsp", '\u2003');   // em space
        entity("thinsp", '\u2009'); // thin space
        entity("zwnj", '\u200c');   // zero width non-joiner
        entity("zwj", '\u200d');    // zero width joiner
        entity("lrm", '\u200e');    // left-to-right mark
        entity("rlm", '\u200f');    // right-to-left mark

        // General punctuation
        entity("ndash", '\u2013');  // –
        entity("mdash", '\u2014');  // —
        entity("sbquo", '\u201a');  // ‚
        entity("bdquo", '\u201e');  // „
        entity("dagger", '\u2020'); // †
        entity("Dagger", '\u2021'); // ‡
        entity("bull", '\u2022');   // •
        entity("hellip", '\u2026'); // …
        entity("permil", '\u2030'); // ‰
        entity("prime", '\u2032');  // ′
        entity("Prime", '\u2033');  // ″
        entity("lsaquo", '\u2039'); // ‹
        entity("rsaquo", '\u203a'); // ›
        entity("oline", '\u203e');  // ‾
        entity("euro", '\u20ac');   // €

        // Arrows, brackets and shapes
        entity("larr", '\u2190');   // ←
        entity("uarr", '\u2191');   // ↑
        entity("rarr", '\u2192');   // →
        entity("darr", '\u2193');   // ↓
        entity("harr", '\u2194');   // ↔
        entity("crarr", '\u21b5');  // ↵
        entity("lceil", '\u2308');  // ⌈
        entity("rceil", '\u2309');  // ⌉
        entity("lfloor", '\u230a'); // ⌊
        entity("rfloor", '\u230b'); // ⌋
        entity("loz", '\u25ca');    // ◊
        entity("spades", '\u2660'); // ♠
        entity("clubs", '\u2663');  // ♣
        entity("hearts", '\u2665'); // ♥
        entity("diams", '\u2666');  // ♦
    }

    /**
     * Creates an analysis over the given characters.
     *
     * @param chars the characters to read; passed unchanged to the {@link StringAnalysis} constructor
     */
    public HtmlStringAnalysis(char[] chars) {
        super(chars);
    }

    /**
     * Reads characters until {@code end} has been read or the input is exhausted, decoding
     * character references on the way.
     * <p>
     * A reference starts at {@code &} and its name runs up to the first character that is
     * {@code <= 32}, equal to {@code end}, or {@code ';'}; running out of input also ends the name.
     * If {@link #character(String)} knows the name, the decoded character is appended and a
     * terminating {@code ';'} is dropped, while a terminating character {@code <= 32} is kept.
     * Otherwise the reference is appended literally, including its terminator unless that
     * terminator is {@code end}.
     *
     * @param end the character that ends the string; it is not included in the result
     * @return the decoded text
     */
    @Override
    public String nextString(char end) {
        StringBuilder string = new StringBuilder();
        while (hashNext()) {
            char c = next();
            if (c == end)
                break;
            if (c != '&') {
                string.append(c);
                continue;
            }

            // Collect the reference name directly from the characters read. Tracking whether a
            // terminator was really seen keeps the last character of the input from being
            // mistaken for one when the input ends in the middle of a reference.
            StringBuilder nameBuilder = new StringBuilder();
            boolean terminated = false;
            char terminator = 0;
            while (hashNext()) {
                char n = next();
                if (n <= 32 || n == end || n == ';') {
                    terminator = n;
                    terminated = true;
                    break;
                }
                nameBuilder.append(n);
            }

            String name = nameBuilder.toString();
            Character decoded = character(name);
            if (decoded != null) {
                string.append(decoded);
                // ';' belongs to the reference and is dropped; whitespace is ordinary text.
                if (terminated && terminator <= 32)
                    string.append(terminator);
            } else {
                // Unknown reference: keep it verbatim.
                string.append('&').append(name);
                if (terminated && terminator != end)
                    string.append(terminator);
            }
            if (terminated && terminator == end)
                break;
        }
        return string.toString();
    }

    /**
     * Converts a character reference name into the character it stands for.
     * <p>
     * Numeric references are passed with a leading {@code #}: {@code "#38"} is decimal and
     * {@code "#x26"} / {@code "#X26"} is hexadecimal. Only ASCII digits are accepted. A numeric
     * reference yields {@code null} when it is malformed, is zero, denotes a surrogate, or lies
     * above {@code U+FFFF} (the result is a single {@code char}).
     *
     * @param name the reference name without the leading {@code &} and trailing {@code ;}
     * @return the character, or {@code null} if the name cannot be converted (including a
     *         {@code null} or empty name)
     */
    public static Character character(String name) {
        if (name == null || name.isEmpty())
            return null;
        if (name.charAt(0) == '#')
            return numericCharacter(name);
        return CHARACTERS_BY_NAME.get(name);
    }

    /**
     * Returns the entity name of a character.
     *
     * @param character the character to look up
     * @return the primary entity name, or {@code null} if the character has no registered name
     */
    public static String characterName(char character) {
        return NAMES_BY_CHARACTER.get(character);
    }

    /** Registers a primary entity: usable for decoding and returned when encoding. */
    private static void entity(String name, char character) {
        CHARACTERS_BY_NAME.put(name, character);
        NAMES_BY_CHARACTER.put(character, name);
    }

    /** Registers a decode-only alias; the reverse lookup keeps the primary name. */
    private static void alias(String name, char character) {
        CHARACTERS_BY_NAME.put(name, character);
    }

    /** Parses a reference of the form {@code #digits} or {@code #xhexdigits}; {@code null} if invalid. */
    private static Character numericCharacter(String reference) {
        int radix = 10;
        int start = 1;
        if (reference.length() > 1 && (reference.charAt(1) == 'x' || reference.charAt(1) == 'X')) {
            radix = 16;
            start = 2;
        }
        if (start >= reference.length())
            return null;

        int codePoint = 0;
        for (int i = start; i < reference.length(); i++) {
            int digit = asciiDigit(reference.charAt(i), radix);
            if (digit < 0)
                return null;
            codePoint = codePoint * radix + digit;
            // Stop as soon as the value no longer fits a char; this also rules out int overflow.
            if (codePoint > Character.MAX_VALUE)
                return null;
        }
        if (codePoint == 0 || Character.isSurrogate((char) codePoint))
            return null;
        return (char) codePoint;
    }

    /** Returns the value of an ASCII digit in the given radix (10 or 16), or -1 if it is not one. */
    private static int asciiDigit(char c, int radix) {
        int value;
        if (c >= '0' && c <= '9')
            value = c - '0';
        else if (c >= 'a' && c <= 'f')
            value = c - 'a' + 10;
        else if (c >= 'A' && c <= 'F')
            value = c - 'A' + 10;
        else
            return -1;
        return value < radix ? value : -1;
    }
}
